%load_ext autoreload
%autoreload 2Plotting for results
This notebook produces all results plots. It generates some gap in the data, fill with a method (filter, MDS …), compute metrics and then makes all relevant plots
import altair as altfrom meteo_imp.kalman.results import *
from meteo_imp.data import *
from meteo_imp.utils import *
import pandas as pd
import numpy as np
from pyprojroot import here
import torch
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import SVG, Image
from meteo_imp.kalman.results import _plot_timeseries, _get_labels
from functools import partial
from contextlib import redirect_stderr
import io
import polars as pl
from fastai.vision.data import get_grid
import cairosvgthe generation of a proper pdf is complex as altair_render doesn’t support XOffset, so plotsare first renderder to svg using vl-convert and then to pdf using cairosvg. However this last methods doesn’t support negative numbers …
Due to the high number of samples also cannot use the browser render in the notebook so using vl-convert to a png for the visualization in the notebook
import vl_convert as vlc
from pyprojroot import here
base_path_img = here("manuscript/Master Thesis - Evaluation of Kalman filter for meteorological time series imputation for Eddy Covariance applications - Simone Massaro/images/")
base_path_tbl = here("manuscript/Master Thesis - Evaluation of Kalman filter for meteorological time series imputation for Eddy Covariance applications - Simone Massaro/tables/")
base_path_img.mkdir(exist_ok=True), base_path_tbl.mkdir(exist_ok=True)
def save_show_plot(plot,
path,
altair_render=False # use altair render for pdf?
):
plt_json = plot.to_json()
if not altair_render:
svg_data = vlc.vegalite_to_svg(vl_spec=plt_json)
with open(base_path_img / (path + ".svg"), 'w') as f:
f.write(svg_data)
cairosvg.svg2pdf(file_obj=open(base_path_img / (path + ".svg")), write_to=str(base_path_img / (path + ".pdf")))
else:
with redirect_stderr(io.StringIO()):
plot.save(base_path_img / (path + ".pdf"))
# render to image for displaying in notebook
png_data = vlc.vegalite_to_png(vl_spec=plot.to_json(), scale=1)
return Image(png_data)reset_seed()
n_rep = 500hai = pd.read_parquet(hai_big_path).reindex(columns=var_type.categories)
hai_era = pd.read_parquet(hai_era_big_path)alt.data_transformers.disable_max_rows() # it is safe to do so as the plots are rendered using vl-convert and then showed as imagesDataTransformerRegistry.enable('default')
Correlation
import matplotlib.pyplot as pltimport statsmodels.api as smdef auto_corr_df(data, nlags=96):
autocorr = {}
for col in data.columns:
autocorr[col] = sm.tsa.acf(data[col], nlags=nlags)
return pd.DataFrame(autocorr)auto_corr = auto_corr_df(hai).reset_index(names="gap_len").melt(id_vars="gap_len")
auto_corr.gap_len = auto_corr.gap_len / 2auto_corr| gap_len | variable | value | |
|---|---|---|---|
| 0 | 0.0 | TA | 1.000000 |
| 1 | 0.5 | TA | 0.998595 |
| 2 | 1.0 | TA | 0.995814 |
| 3 | 1.5 | TA | 0.992141 |
| 4 | 2.0 | TA | 0.987630 |
| ... | ... | ... | ... |
| 868 | 46.0 | TS | 0.959680 |
| 869 | 46.5 | TS | 0.961116 |
| 870 | 47.0 | TS | 0.962085 |
| 871 | 47.5 | TS | 0.962551 |
| 872 | 48.0 | TS | 0.962480 |
873 rows × 3 columns
p = (alt.Chart(auto_corr).mark_line().encode(
x = alt.X('gap_len', title="Gap length [h]", axis = alt.Axis(values= [12, 24, 36, 48])),
y = alt.Y("value", title="correlation"),
color=alt.Color("variable", scale=meteo_scale, title="Variable"),
facet =alt.Facet('variable', columns=3, sort = meteo_scale.domain, title=None,
header = alt.Header(labelFontWeight="bold", labelFontSize=20))
)
.properties(height=120, width=250)
.resolve_scale(y='independent', x = 'independent')
.pipe(plot_formatter))
save_show_plot(p, "temporal_autocorrelation")
axes = get_grid(1,1,1, figsize=(10,8))
sns.set(font_scale=1.25)
sns.heatmap(hai.corr(), annot=True, vmin=-1, vmax=1, center=0,
cmap=sns.diverging_palette(20, 220, n=200), ax=axes[0], square=True, cbar=True)
# axes[0].set(xlabel="Variable", ylabel="Variable", title="Inter-variable Correlation");
size_old = plt.rcParams["axes.labelsize"]
plt.rcParams["axes.labelsize"] = 30
plt.tight_layout()
plt.savefig(base_path_img / "correlation.pdf")
plt.rcParams["axes.labelsize"] = size_old
Comparison Imputation methods
base_path = here("analysis/results/trained_models")def l_model(x, base_path=base_path): return torch.load(base_path / x)models_var = pd.DataFrame.from_records([
{'var': 'TA', 'model': l_model("TA_specialized_gap_6-336_v3_0.pickle",base_path)},
{'var': 'SW_IN', 'model': l_model("SW_IN_specialized_gap_6-336_v2_0.pickle",base_path)},
{'var': 'LW_IN', 'model': l_model("LW_IN_specialized_gap_6-336_v1.pickle",base_path)},
{'var': 'VPD', 'model': l_model("VPD_specialized_gap_6-336_v2_0.pickle",base_path)},
{'var': 'WS', 'model': l_model("WS_specialized_gap_6-336_v1.pickle",base_path)},
{'var': 'PA', 'model': l_model("PA_specialized_gap_6-336_v3_0.pickle",base_path)},
{'var': 'P', 'model': l_model("1_gap_varying_6-336_v3.pickle",base_path)},
{'var': 'TS', 'model': l_model("TS_specialized_gap_6-336_v2_0.pickle",base_path)},
{'var': 'SWC', 'model': l_model("SWC_specialized_gap_6-336_v2_1.pickle",base_path)},
])@cache_disk(cache_dir / "the_results")
def get_the_results(n_rep=20):
reset_seed()
comp_Av = ImpComparison(models = models_var, df = hai, control = hai_era, block_len = 446, time_series=False)
results_Av = comp_Av.compare(gap_len = [12,24, 48, 336], var=list(hai.columns), n_rep=n_rep)
return results_Av
results_Av = get_the_results(n_rep)State of the art
the first plot is a time series using only state-of-the-art methods
reset_seed()
comp_ts = ImpComparison(models = models_var, df = hai, control = hai_era, block_len = 48+100, time_series=True, rmse=False)
results_ts = comp_ts.compare(gap_len = [48], var=list(hai.columns), n_rep=1) res_ts = results_ts.query("method != 'Kalman Filter'")
res_ts_plot = pd.concat([unnest_predictions(row, ctx_len=72) for _,row in res_ts.iterrows()])scale_sota = alt.Scale(domain=["ERA-I", "MDS"], range=list(sns.color_palette('Dark2', 3).as_hex())[1:])p = (facet_wrap(res_ts_plot, partial(_plot_timeseries, scale_color=scale_sota, err_band = False), col="var",
y_labels = _get_labels(res_ts_plot, 'mean', None),
)
.pipe(plot_formatter)
)
save_show_plot(p, "timeseries_sota")
Percentage improvement
results_Av.method.unique()['Kalman Filter', 'ERA-I', 'MDS']
Categories (3, object): ['Kalman Filter' < 'ERA-I' < 'MDS']
all_res = results_Av.query('var != "P"').groupby(['method']).agg({'rmse_stand': 'mean'}).Tall_res| method | Kalman Filter | ERA-I | MDS |
|---|---|---|---|
| rmse_stand | 0.204628 | 0.307361 | 0.482837 |
percentage of improvement across all variables
(all_res["ERA-I"] - all_res["Kalman Filter"]) / all_res["ERA-I"] * 100 rmse_stand 33.42398
dtype: float64
(all_res["MDS"] - all_res["Kalman Filter"]) / all_res["MDS"] * 100 rmse_stand 57.619542
dtype: float64
res_var = results_Av.groupby(['method', 'var']).agg({'rmse_stand': 'mean'}) res_var = res_var.reset_index().pivot(columns='method', values='rmse_stand', index='var')pd.DataFrame({'ERA': (res_var["ERA-I"] - res_var["Kalman Filter"]) / res_var["ERA-I"] * 100, 'MDS': (res_var["MDS"] - res_var["Kalman Filter"]) / res_var["MDS"] * 100 })| ERA | MDS | |
|---|---|---|
| var | ||
| TA | 54.540802 | 77.713711 |
| SW_IN | 12.004508 | 35.516142 |
| LW_IN | 5.166063 | 52.289627 |
| VPD | 44.402821 | 65.407769 |
| WS | 21.064305 | 40.321732 |
| PA | 28.784191 | 90.751559 |
| P | -18.544370 | -22.084360 |
| SWC | NaN | 41.543006 |
| TS | NaN | 25.772326 |
res_var2 = results_Av.groupby(['method', 'var', 'gap_len']).agg({'rmse_stand': 'mean'}) res_var2 = res_var2.reset_index().pivot(columns='method', values='rmse_stand', index=['var', 'gap_len'])pd.DataFrame({'ERA': (res_var2["ERA-I"] - res_var2["Kalman Filter"]) / res_var2["ERA-I"] * 100, 'MDS': (res_var2["MDS"] - res_var2["Kalman Filter"]) / res_var2["MDS"] * 100 })| ERA | MDS | ||
|---|---|---|---|
| var | gap_len | ||
| TA | 6 | 69.897582 | 85.052698 |
| 12 | 58.766166 | 79.376385 | |
| 24 | 51.538443 | 75.395970 | |
| 168 | 41.823614 | 73.000401 | |
| SW_IN | 6 | 9.519984 | 29.746651 |
| 12 | 11.165399 | 30.639223 | |
| 24 | 14.232051 | 34.811941 | |
| 168 | 12.305658 | 42.651906 | |
| LW_IN | 6 | 21.023524 | 59.136518 |
| 12 | 9.110040 | 52.211404 | |
| 24 | -3.553292 | 50.720632 | |
| 168 | -4.260023 | 48.223005 | |
| VPD | 6 | 66.980942 | 79.449579 |
| 12 | 47.785633 | 69.081018 | |
| 24 | 33.663749 | 56.728120 | |
| 168 | 32.272332 | 57.702579 | |
| WS | 6 | 32.402977 | 45.724043 |
| 12 | 25.209162 | 43.275430 | |
| 24 | 15.543672 | 37.142502 | |
| 168 | 12.735569 | 36.436106 | |
| PA | 6 | 39.823585 | 91.511486 |
| 12 | 30.995845 | 90.532461 | |
| 24 | 24.727301 | 89.319180 | |
| 168 | 20.691181 | 91.421434 | |
| P | 6 | -18.485009 | -13.917879 |
| 12 | -28.935358 | -37.127331 | |
| 24 | -24.423076 | -29.998707 | |
| 168 | -7.725322 | -11.587796 | |
| SWC | 6 | NaN | 61.302664 |
| 12 | NaN | 47.976950 | |
| 24 | NaN | 42.535719 | |
| 168 | NaN | 23.301469 | |
| TS | 6 | NaN | 64.264901 |
| 12 | NaN | 46.699870 | |
| 24 | NaN | 27.050291 | |
| 168 | NaN | -15.268479 |
Main plot
from itertools import product
import altair as altp = the_plot(results_Av)
save_show_plot(p, "the_plot")
p = the_plot_stand(results_Av)
save_show_plot(p, "the_plot_stand")
Table
t = the_table(results_Av)
the_table_latex(t, base_path_tbl / "the_table.tex", label="tbl:the_table",
caption="\\CapTheTable")
t| Kalman Filter | ERA-I | MDS | |||||
|---|---|---|---|---|---|---|---|
| RMSE | mean | std | mean | std | mean | std | |
| Variable | Gap | ||||||
| TA | 6 h | 0.405453 | 0.258301 | 1.346910 | 0.997843 | 2.712546 | 1.896914 |
| 12 h | 0.606836 | 0.400849 | 1.471695 | 0.900611 | 2.942435 | 1.748131 | |
| 1 day (24 h) | 0.741275 | 0.368468 | 1.529614 | 0.800256 | 3.012819 | 1.611311 | |
| 1 week (168 h) | 1.020608 | 0.444591 | 1.754334 | 0.643160 | 3.780087 | 1.315472 | |
| SW_IN | 6 h | 44.636609 | 40.464629 | 49.333113 | 66.241975 | 63.536627 | 85.401585 |
| 12 h | 48.155186 | 33.868178 | 54.207691 | 49.769296 | 69.427115 | 68.936352 | |
| 1 day (24 h) | 56.564277 | 30.042752 | 65.950367 | 40.930505 | 86.770917 | 59.603564 | |
| 1 week (168 h) | 61.582820 | 25.740161 | 70.224393 | 34.883199 | 107.384249 | 53.606111 | |
| LW_IN | 6 h | 10.902409 | 7.736087 | 13.804628 | 12.987987 | 26.680077 | 15.022366 |
| 12 h | 13.421656 | 7.734502 | 14.766929 | 12.584725 | 28.085478 | 13.457335 | |
| 1 day (24 h) | 14.593819 | 7.840046 | 14.093052 | 12.227900 | 29.614461 | 12.416763 | |
| 1 week (168 h) | 17.062880 | 6.425136 | 16.365697 | 11.129569 | 32.954558 | 8.833972 | |
| VPD | 6 h | 0.428187 | 0.363168 | 1.296787 | 1.547397 | 2.083592 | 2.149288 |
| 12 h | 0.660623 | 0.504761 | 1.265213 | 1.288794 | 2.136626 | 2.095549 | |
| 1 day (24 h) | 0.827563 | 0.501975 | 1.247527 | 1.032319 | 1.912472 | 1.605013 | |
| 1 week (168 h) | 1.125680 | 0.633392 | 1.662069 | 1.127314 | 2.661345 | 1.965431 | |
| WS | 6 h | 0.616774 | 0.316972 | 0.912428 | 0.508295 | 1.136367 | 0.783146 |
| 12 h | 0.715412 | 0.350974 | 0.956550 | 0.524247 | 1.261203 | 0.796744 | |
| 1 day (24 h) | 0.801851 | 0.343378 | 0.949427 | 0.446912 | 1.275665 | 0.608630 | |
| 1 week (168 h) | 0.950211 | 0.363124 | 1.088887 | 0.348541 | 1.494891 | 0.615371 | |
| PA | 6 h | 0.045046 | 0.034294 | 0.074856 | 0.061726 | 0.530665 | 0.441476 |
| 12 h | 0.053359 | 0.041613 | 0.077328 | 0.058476 | 0.563603 | 0.427426 | |
| 1 day (24 h) | 0.059481 | 0.038666 | 0.079021 | 0.051491 | 0.556899 | 0.404451 | |
| 1 week (168 h) | 0.066325 | 0.047544 | 0.083628 | 0.053654 | 0.773143 | 0.384029 | |
| P | 6 h | 0.134093 | 0.274033 | 0.113173 | 0.315504 | 0.117710 | 0.305539 |
| 12 h | 0.178871 | 0.295419 | 0.138729 | 0.297227 | 0.130442 | 0.281377 | |
| 1 day (24 h) | 0.206231 | 0.253588 | 0.165750 | 0.288432 | 0.158641 | 0.265257 | |
| 1 week (168 h) | 0.239885 | 0.173820 | 0.222682 | 0.201782 | 0.214975 | 0.197499 | |
| SWC | 6 h | 0.508379 | 0.487342 | NaN | NaN | 1.313730 | 1.556829 |
| 12 h | 0.664855 | 0.471849 | NaN | NaN | 1.278001 | 1.323011 | |
| 1 day (24 h) | 0.779066 | 0.640996 | NaN | NaN | 1.355740 | 1.472185 | |
| 1 week (168 h) | 1.493784 | 0.947799 | NaN | NaN | 1.947605 | 1.488284 | |
| TS | 6 h | 0.341080 | 0.431992 | NaN | NaN | 0.954469 | 0.889126 |
| 12 h | 0.534363 | 0.783787 | NaN | NaN | 1.002555 | 0.876784 | |
| 1 day (24 h) | 0.786670 | 0.851931 | NaN | NaN | 1.078373 | 0.856964 | |
| 1 week (168 h) | 1.659875 | 1.077782 | NaN | NaN | 1.440008 | 0.764040 | |
t = the_table(results_Av, 'rmse_stand', y_name="Stand. RMSE")
the_table_latex(t, base_path_tbl / "the_table_stand.tex", stand = True, label="tbl:the_table_stand",
caption = "\\CapTheTableStand")
t| Kalman Filter | ERA-I | MDS | |||||
|---|---|---|---|---|---|---|---|
| Stand. RMSE | mean | std | mean | std | mean | std | |
| Variable | Gap | ||||||
| TA | 6 h | 0.051164 | 0.032595 | 0.169965 | 0.125917 | 0.342294 | 0.239370 |
| 12 h | 0.076576 | 0.050583 | 0.185712 | 0.113647 | 0.371303 | 0.220595 | |
| 1 day (24 h) | 0.093541 | 0.046497 | 0.193021 | 0.100984 | 0.380185 | 0.203330 | |
| 1 week (168 h) | 0.128790 | 0.056103 | 0.221378 | 0.081160 | 0.477006 | 0.165998 | |
| SW_IN | 6 h | 0.218804 | 0.198354 | 0.241826 | 0.324711 | 0.311450 | 0.418630 |
| 12 h | 0.236052 | 0.166018 | 0.265721 | 0.243964 | 0.340325 | 0.337919 | |
| 1 day (24 h) | 0.277272 | 0.147267 | 0.323282 | 0.200637 | 0.425342 | 0.292171 | |
| 1 week (168 h) | 0.301873 | 0.126176 | 0.344233 | 0.170994 | 0.526387 | 0.262772 | |
| LW_IN | 6 h | 0.259855 | 0.184387 | 0.329028 | 0.309564 | 0.635910 | 0.358053 |
| 12 h | 0.319900 | 0.184349 | 0.351964 | 0.299952 | 0.669407 | 0.320751 | |
| 1 day (24 h) | 0.347838 | 0.186865 | 0.335903 | 0.291448 | 0.705850 | 0.295949 | |
| 1 week (168 h) | 0.406688 | 0.153141 | 0.390071 | 0.265269 | 0.785460 | 0.210555 | |
| VPD | 6 h | 0.098019 | 0.083135 | 0.296855 | 0.354224 | 0.476967 | 0.492006 |
| 12 h | 0.151227 | 0.115548 | 0.289627 | 0.295025 | 0.489108 | 0.479704 | |
| 1 day (24 h) | 0.189442 | 0.114910 | 0.285579 | 0.236314 | 0.437795 | 0.367413 | |
| 1 week (168 h) | 0.257686 | 0.144994 | 0.380474 | 0.258060 | 0.609224 | 0.449918 | |
| WS | 6 h | 0.379454 | 0.195008 | 0.561347 | 0.312715 | 0.699120 | 0.481810 |
| 12 h | 0.440138 | 0.215927 | 0.588492 | 0.322529 | 0.775922 | 0.490176 | |
| 1 day (24 h) | 0.493318 | 0.211254 | 0.584110 | 0.274951 | 0.784819 | 0.374443 | |
| 1 week (168 h) | 0.584592 | 0.223403 | 0.669909 | 0.214431 | 0.919692 | 0.378591 | |
| PA | 6 h | 0.052675 | 0.040103 | 0.087534 | 0.072180 | 0.620545 | 0.516250 |
| 12 h | 0.062397 | 0.048661 | 0.090425 | 0.068381 | 0.659061 | 0.499820 | |
| 1 day (24 h) | 0.069556 | 0.045215 | 0.092405 | 0.060212 | 0.651223 | 0.472953 | |
| 1 week (168 h) | 0.077558 | 0.055597 | 0.097793 | 0.062741 | 0.904092 | 0.449073 | |
| P | 6 h | 0.478431 | 0.977725 | 0.403790 | 1.125691 | 0.419979 | 1.090136 |
| 12 h | 0.638197 | 1.054031 | 0.494974 | 1.060481 | 0.465404 | 1.003928 | |
| 1 day (24 h) | 0.735816 | 0.904779 | 0.591382 | 1.029100 | 0.566018 | 0.946414 | |
| 1 week (168 h) | 0.855891 | 0.620173 | 0.794512 | 0.719941 | 0.767011 | 0.704660 | |
| SWC | 6 h | 0.057037 | 0.054677 | NaN | NaN | 0.147393 | 0.174667 |
| 12 h | 0.074593 | 0.052939 | NaN | NaN | 0.143384 | 0.148434 | |
| 1 day (24 h) | 0.087407 | 0.071916 | NaN | NaN | 0.152106 | 0.165171 | |
| 1 week (168 h) | 0.167594 | 0.106338 | NaN | NaN | 0.218510 | 0.166977 | |
| TS | 6 h | 0.060276 | 0.076342 | NaN | NaN | 0.168674 | 0.157127 |
| 12 h | 0.094433 | 0.138512 | NaN | NaN | 0.177172 | 0.154946 | |
| 1 day (24 h) | 0.139021 | 0.150554 | NaN | NaN | 0.190571 | 0.151443 | |
| 1 week (168 h) | 0.293335 | 0.190466 | NaN | NaN | 0.254479 | 0.135022 | |
Timeseries
@cache_disk(cache_dir / "the_results_ts")
def get_the_results_ts():
reset_seed()
comp_Av = ImpComparison(models = models_var, df = hai, control = hai_era, block_len = 446, time_series=True, rmse=False)
results_Av = comp_Av.compare(gap_len = [12,24, 336], var=list(hai.columns), n_rep=4)
return results_Av
results_ts = get_the_results_ts()ts = plot_timeseries(results_ts.query("var in ['TA', 'SW_IN', 'LW_IN', 'VPD', 'WS']"), idx_rep=0)
save_show_plot(ts, "timeseries_1", altair_render=True)
%time ts = plot_timeseries(results_ts.query("var in ['PA', 'P', 'TS', 'SWC']"), idx_rep=0)
%time save_show_plot(ts, "timeseries_2", altair_render=True)CPU times: user 2.72 s, sys: 2.93 ms, total: 2.72 s
Wall time: 2.73 s
CPU times: user 7.58 s, sys: 77.8 ms, total: 7.66 s
Wall time: 11.5 s

from tqdm.auto import tqdm# @cache_disk(cache_dir / "ts_plot", rm_cache=True)
def plot_additional_ts():
for idx in tqdm(results_ts.idx_rep.unique()):
if idx == 0: continue # skip first plot as is done above
ts1 = plot_timeseries(results_ts.query("var in ['TA', 'SW_IN', 'LW_IN', 'VPD', 'WS']"), idx_rep=idx)
save_show_plot(ts1, f"timeseries_1_{idx}", altair_render=True)
ts2 = plot_timeseries(results_ts.query("var in ['PA', 'P', 'TS', 'SWC']"), idx_rep=idx)
save_show_plot(ts2, f"timeseries_2_{idx}", altair_render=True) plot_additional_ts()Kalman Filter analysis
Gap len
@cache_disk("gap_len")
def get_g_len(n_rep=n_rep):
reset_seed()
return KalmanImpComparison(models_var, hai, hai_era, block_len=48*7+100).compare(gap_len = [2,6,12,24,48,48*2, 48*3, 48*7], var=list(hai.columns), n_rep=n_rep)gap_len = get_g_len(n_rep)p = plot_gap_len(gap_len, hai, hai_era)
save_show_plot(p, "gap_len")
t = table_gap_len(gap_len)
table_gap_len_latex(t, base_path_tbl / "gap_len.tex", label="gap_len",
caption="\\CapGapLen")
t| Gap | 1 h | 3 h | 6 h | 12 h | 1 day (24 h) | 2 days (48 h) | 3 days (72 h) | 1 week (168 h) | |
|---|---|---|---|---|---|---|---|---|---|
| Variable | Stand. RMSE | ||||||||
| TA | mean | 0.025327 | 0.034172 | 0.055041 | 0.077722 | 0.107172 | 0.115631 | 0.119447 | 0.124932 |
| std | 0.022582 | 0.022085 | 0.037678 | 0.048111 | 0.059004 | 0.055879 | 0.052048 | 0.042619 | |
| SW_IN | mean | 0.141258 | 0.185323 | 0.219323 | 0.242353 | 0.283377 | 0.279351 | 0.286491 | 0.295492 |
| std | 0.164966 | 0.180712 | 0.192019 | 0.162735 | 0.139278 | 0.130424 | 0.125494 | 0.120383 | |
| LW_IN | mean | 0.122415 | 0.199663 | 0.251175 | 0.355663 | 0.363577 | 0.401382 | 0.399580 | 0.426373 |
| std | 0.159722 | 0.171639 | 0.181869 | 0.226522 | 0.188607 | 0.195204 | 0.185700 | 0.164282 | |
| VPD | mean | 0.044495 | 0.072448 | 0.106393 | 0.185943 | 0.203379 | 0.226881 | 0.238030 | 0.258977 |
| std | 0.050343 | 0.092523 | 0.093396 | 0.176841 | 0.135613 | 0.141761 | 0.133105 | 0.148695 | |
| WS | mean | 0.236801 | 0.305489 | 0.371760 | 0.473200 | 0.505951 | 0.551268 | 0.530790 | 0.564939 |
| std | 0.198199 | 0.193005 | 0.202284 | 0.273506 | 0.285058 | 0.238593 | 0.200557 | 0.195587 | |
| PA | mean | 0.026844 | 0.038072 | 0.055131 | 0.063885 | 0.068426 | 0.075262 | 0.074562 | 0.080174 |
| std | 0.026776 | 0.029629 | 0.065971 | 0.039537 | 0.050700 | 0.058926 | 0.055454 | 0.059972 | |
| P | mean | 0.246141 | 0.550198 | 0.443231 | 0.722287 | 0.693768 | 0.729318 | 0.850642 | 0.907571 |
| std | 0.764003 | 1.821625 | 0.525170 | 1.351510 | 0.782852 | 0.619818 | 0.749813 | 0.583107 | |
| SWC | mean | 0.019637 | 0.038882 | 0.053718 | 0.067853 | 0.087583 | 0.098537 | 0.113108 | 0.165704 |
| std | 0.016406 | 0.034985 | 0.036017 | 0.042192 | 0.062350 | 0.072180 | 0.086248 | 0.099118 | |
| TS | mean | 0.025595 | 0.044131 | 0.067155 | 0.105208 | 0.129654 | 0.185770 | 0.209769 | 0.315963 |
| std | 0.026281 | 0.042837 | 0.070520 | 0.144570 | 0.118676 | 0.147786 | 0.139939 | 0.218048 |
g_len_agg = gap_len.groupby('gap_len').agg({'rmse_stand': 'mean'})
(g_len_agg.iloc[0])/g_len_agg.iloc[-1]rmse_stand 0.282954
dtype: float64
g_len_agg = gap_len.groupby(['gap_len', 'var']).agg({'rmse_stand': 'mean'})
(g_len_agg.loc[1.])/g_len_agg.loc[168.]| rmse_stand | |
|---|---|
| var | |
| TA | 0.202725 |
| SW_IN | 0.478043 |
| LW_IN | 0.287107 |
| VPD | 0.171809 |
| WS | 0.419162 |
| PA | 0.334820 |
| P | 0.271208 |
| SWC | 0.118507 |
| TS | 0.081006 |
g_len_agg| rmse_stand | ||
|---|---|---|
| gap_len | var | |
| 1 | TA | 0.025327 |
| SW_IN | 0.141258 | |
| LW_IN | 0.122415 | |
| VPD | 0.044495 | |
| WS | 0.236801 | |
| ... | ... | ... |
| 168 | WS | 0.564939 |
| PA | 0.080174 | |
| P | 0.907571 | |
| SWC | 0.165704 | |
| TS | 0.315963 |
72 rows × 1 columns
g_len_agg_std = gap_len.groupby('gap_len').agg({'rmse_stand': 'std'})
(g_len_agg_std.iloc[0])/g_len_agg_std.iloc[-1]rmse_stand 0.849176
dtype: float64
(gap_len.groupby(['gap_len', 'var']).agg({'rmse_stand': 'std'})
.unstack("var")
.droplevel(0, 1)
.plot(subplots=True, layout=(3,3), figsize=(10,10)))array([[<AxesSubplot: xlabel='gap_len'>, <AxesSubplot: xlabel='gap_len'>,
<AxesSubplot: xlabel='gap_len'>],
[<AxesSubplot: xlabel='gap_len'>, <AxesSubplot: xlabel='gap_len'>,
<AxesSubplot: xlabel='gap_len'>],
[<AxesSubplot: xlabel='gap_len'>, <AxesSubplot: xlabel='gap_len'>,
<AxesSubplot: xlabel='gap_len'>]], dtype=object)

# with open(base_path_tbl / "gap_len.tex") as f:
# print(f.readlines())Control
models_nc = pd.DataFrame({'model': [ l_model("1_gap_varying_336_no_control_v1.pickle"), l_model("1_gap_varying_6-336_v3.pickle")],
'type': [ 'No Control', 'Use Control' ]}) @cache_disk("use_control")
def get_control(n_rep=n_rep):
reset_seed()
kcomp_control = KalmanImpComparison(models_nc, hai, hai_era, block_len=100+48*7)
k_results_control = kcomp_control.compare(n_rep =n_rep, gap_len = [12, 24, 48, 48*7], var = list(hai.columns))
return k_results_controlfrom time import sleepk_results_control = get_control(n_rep)k_results_control| var | loss | likelihood | gap_len | idx_rep | type | rmse | rmse_stand | gap_len_f | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | TA | -5.742477 | 2.142377 | 6 | 0 | No Control | 0.227680 | 0.028731 | 6 h |
| 1 | TA | -5.755538 | 2.148783 | 6 | 1 | No Control | 0.228037 | 0.028776 | 6 h |
| 2 | TA | -1.232816 | 2.132641 | 6 | 2 | No Control | 1.813357 | 0.228826 | 6 h |
| 3 | TA | -4.693027 | 2.118098 | 6 | 3 | No Control | 0.906512 | 0.114392 | 6 h |
| 4 | TA | -5.179578 | 2.151546 | 6 | 4 | No Control | 0.726325 | 0.091654 | 6 h |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | TS | 115.193441 | 0.966260 | 168 | 495 | Use Control | 1.770984 | 0.312970 | 1 week (168 h) |
| 996 | TS | 82.913345 | 1.474181 | 168 | 496 | Use Control | 1.403469 | 0.248022 | 1 week (168 h) |
| 997 | TS | 93.694508 | 1.154413 | 168 | 497 | Use Control | 1.526679 | 0.269796 | 1 week (168 h) |
| 998 | TS | 118.712986 | 1.494637 | 168 | 498 | Use Control | 1.798757 | 0.317878 | 1 week (168 h) |
| 999 | TS | 93.694508 | 1.154413 | 168 | 499 | Use Control | 1.526679 | 0.269796 | 1 week (168 h) |
36000 rows × 9 columns
p = plot_compare(k_results_control, 'type', y = 'rmse', scale_domain=["Use Control", "No Control"])
save_show_plot(p, "use_control")
pfrom functools import partialt = table_compare(k_results_control, 'type')
table_compare_latex(t, base_path_tbl / "control.tex", label="tbl:control",
caption="\\CapControl")
t| type | Use Control | No Control | ||||||
|---|---|---|---|---|---|---|---|---|
| Stand. RMSE | mean | std | se | mean | std | se | diff. | |
| Variable | Gap | |||||||
| TA | 6 h | 0.092151 | 0.055814 | 0.002496 | 0.094594 | 0.059489 | 0.002660 | 0.002443 |
| 12 h | 0.118358 | 0.074072 | 0.003313 | 0.160712 | 0.106870 | 0.004779 | 0.042354 | |
| 1 day (24 h) | 0.146578 | 0.076799 | 0.003435 | 0.232744 | 0.159137 | 0.007117 | 0.086166 | |
| 1 week (168 h) | 0.174324 | 0.064495 | 0.002884 | 0.294072 | 0.147849 | 0.006612 | 0.119748 | |
| SW_IN | 6 h | 0.255795 | 0.164694 | 0.007365 | 0.331166 | 0.270628 | 0.012103 | 0.075371 |
| 12 h | 0.314755 | 0.164839 | 0.007372 | 0.518890 | 0.319771 | 0.014301 | 0.204135 | |
| 1 day (24 h) | 0.339310 | 0.128516 | 0.005747 | 0.608981 | 0.277150 | 0.012395 | 0.269671 | |
| 1 week (168 h) | 0.357922 | 0.103167 | 0.004614 | 0.693166 | 0.248412 | 0.011109 | 0.335244 | |
| LW_IN | 6 h | 0.299442 | 0.201432 | 0.009008 | 0.330486 | 0.185322 | 0.008288 | 0.031044 |
| 12 h | 0.367537 | 0.197000 | 0.008810 | 0.485509 | 0.193452 | 0.008651 | 0.117972 | |
| 1 day (24 h) | 0.400412 | 0.175611 | 0.007854 | 0.561446 | 0.186706 | 0.008350 | 0.161035 | |
| 1 week (168 h) | 0.466036 | 0.141424 | 0.006325 | 0.666359 | 0.161485 | 0.007222 | 0.200323 | |
| VPD | 6 h | 0.137863 | 0.104531 | 0.004675 | 0.178626 | 0.139476 | 0.006238 | 0.040762 |
| 12 h | 0.226616 | 0.183575 | 0.008210 | 0.320797 | 0.262730 | 0.011750 | 0.094181 | |
| 1 day (24 h) | 0.272840 | 0.206783 | 0.009248 | 0.407586 | 0.296989 | 0.013282 | 0.134746 | |
| 1 week (168 h) | 0.325113 | 0.154287 | 0.006900 | 0.499408 | 0.252086 | 0.011274 | 0.174294 | |
| WS | 6 h | 0.571539 | 0.523304 | 0.023403 | 0.429625 | 0.262009 | 0.011717 | -0.141914 |
| 12 h | 0.660302 | 0.408751 | 0.018280 | 0.680223 | 0.391765 | 0.017520 | 0.019921 | |
| 1 day (24 h) | 0.676472 | 0.374177 | 0.016734 | 0.767313 | 0.430367 | 0.019247 | 0.090841 | |
| 1 week (168 h) | 0.781636 | 0.264823 | 0.011843 | 0.888977 | 0.318094 | 0.014226 | 0.107340 | |
| PA | 6 h | 0.099325 | 0.066109 | 0.002956 | 0.144519 | 0.105120 | 0.004701 | 0.045194 |
| 12 h | 0.116360 | 0.061026 | 0.002729 | 0.338753 | 0.209793 | 0.009382 | 0.222393 | |
| 1 day (24 h) | 0.136620 | 0.076087 | 0.003403 | 0.562996 | 0.378688 | 0.016935 | 0.426376 | |
| 1 week (168 h) | 0.158988 | 0.068315 | 0.003055 | 0.879748 | 0.439831 | 0.019670 | 0.720760 | |
| P | 6 h | 0.542198 | 0.990843 | 0.044312 | 0.507613 | 0.973715 | 0.043546 | -0.034584 |
| 12 h | 0.627167 | 1.034936 | 0.046284 | 0.605429 | 1.025064 | 0.045842 | -0.021738 | |
| 1 day (24 h) | 0.628179 | 0.575428 | 0.025734 | 0.614481 | 0.583966 | 0.026116 | -0.013698 | |
| 1 week (168 h) | 0.880507 | 0.572737 | 0.025614 | 0.860985 | 0.597007 | 0.026699 | -0.019522 | |
| SWC | 6 h | 0.152144 | 0.107350 | 0.004801 | 0.151053 | 0.141370 | 0.006322 | -0.001092 |
| 12 h | 0.244768 | 0.154341 | 0.006902 | 0.305445 | 0.233699 | 0.010451 | 0.060678 | |
| 1 day (24 h) | 0.392553 | 0.222757 | 0.009962 | 0.538003 | 0.337738 | 0.015104 | 0.145450 | |
| 1 week (168 h) | 0.719138 | 0.314318 | 0.014057 | 0.842404 | 0.431371 | 0.019292 | 0.123267 | |
| TS | 6 h | 0.168111 | 0.114636 | 0.005127 | 0.118928 | 0.076226 | 0.003409 | -0.049183 |
| 12 h | 0.226302 | 0.134730 | 0.006025 | 0.203284 | 0.132946 | 0.005946 | -0.023019 | |
| 1 day (24 h) | 0.286258 | 0.152198 | 0.006807 | 0.285367 | 0.174358 | 0.007798 | -0.000891 | |
| 1 week (168 h) | 0.362279 | 0.176402 | 0.007889 | 0.359044 | 0.191313 | 0.008556 | -0.003235 | |
Gap in Other variables
models_gap_single = pd.DataFrame.from_records([
{'Gap': 'All variables', 'gap_single_var': False, 'model': l_model("all_varying_gap_varying_len_6-30_v3.pickle")},
{'Gap': 'Only one var', 'gap_single_var': True, 'model': l_model("all_varying_gap_varying_len_6-30_v3.pickle")},
])@cache_disk("gap_single")
def get_gap_single(n_rep):
kcomp_single = KalmanImpComparison(models_gap_single, hai, hai_era, block_len=130)
return kcomp_single.compare(n_rep =n_rep, gap_len = [6, 12, 24, 30], var = list(hai.columns))res_single = get_gap_single(n_rep)p = plot_compare(res_single, "Gap", y = 'rmse', scale_domain=["Only one var", "All variables"])
save_show_plot(p, "gap_single_var")
t = table_compare(res_single, 'Gap')
table_compare_latex(t, base_path_tbl / "gap_single_var.tex", caption="\\CapGapSingle", label="tbl:gap_single_var")
t| Gap | Only one var | All variables | ||||||
|---|---|---|---|---|---|---|---|---|
| Stand. RMSE | mean | std | se | mean | std | se | diff. | |
| Variable | Gap | |||||||
| TA | 3 h | 0.029116 | 0.026433 | 0.001182 | 0.048143 | 0.040335 | 0.001804 | 0.019027 |
| 6 h | 0.042861 | 0.031811 | 0.001423 | 0.080906 | 0.062887 | 0.002812 | 0.038045 | |
| 12 h | 0.071487 | 0.049631 | 0.002220 | 0.124425 | 0.096532 | 0.004317 | 0.052939 | |
| 15 h | 0.079382 | 0.053998 | 0.002415 | 0.135941 | 0.087736 | 0.003924 | 0.056559 | |
| SW_IN | 3 h | 0.198721 | 0.204619 | 0.009151 | 0.196330 | 0.239872 | 0.010727 | -0.002390 |
| 6 h | 0.230639 | 0.190542 | 0.008521 | 0.247159 | 0.234288 | 0.010478 | 0.016520 | |
| 12 h | 0.268135 | 0.173156 | 0.007744 | 0.279729 | 0.220169 | 0.009846 | 0.011595 | |
| 15 h | 0.254110 | 0.137783 | 0.006162 | 0.275088 | 0.184996 | 0.008273 | 0.020978 | |
| LW_IN | 3 h | 0.198619 | 0.171501 | 0.007670 | 0.204255 | 0.192592 | 0.008613 | 0.005636 |
| 6 h | 0.288557 | 0.205514 | 0.009191 | 0.285575 | 0.220568 | 0.009864 | -0.002982 | |
| 12 h | 0.336802 | 0.211468 | 0.009457 | 0.346164 | 0.234498 | 0.010487 | 0.009362 | |
| 15 h | 0.339017 | 0.183203 | 0.008193 | 0.343338 | 0.211743 | 0.009469 | 0.004321 | |
| VPD | 3 h | 0.063849 | 0.064851 | 0.002900 | 0.096974 | 0.101109 | 0.004522 | 0.033125 |
| 6 h | 0.096670 | 0.092415 | 0.004133 | 0.154050 | 0.152273 | 0.006810 | 0.057380 | |
| 12 h | 0.163176 | 0.134966 | 0.006036 | 0.235416 | 0.209535 | 0.009371 | 0.072241 | |
| 15 h | 0.170890 | 0.147536 | 0.006598 | 0.257427 | 0.230171 | 0.010294 | 0.086537 | |
| WS | 3 h | 0.299512 | 0.185594 | 0.008300 | 0.300374 | 0.185026 | 0.008275 | 0.000862 |
| 6 h | 0.394661 | 0.234428 | 0.010484 | 0.396053 | 0.237782 | 0.010634 | 0.001392 | |
| 12 h | 0.479464 | 0.249523 | 0.011159 | 0.499147 | 0.264118 | 0.011812 | 0.019682 | |
| 15 h | 0.470032 | 0.218304 | 0.009763 | 0.481117 | 0.234884 | 0.010504 | 0.011084 | |
| PA | 3 h | 0.024250 | 0.015962 | 0.000714 | 0.028151 | 0.017252 | 0.000772 | 0.003902 |
| 6 h | 0.036931 | 0.024932 | 0.001115 | 0.042390 | 0.031342 | 0.001402 | 0.005459 | |
| 12 h | 0.050980 | 0.025147 | 0.001125 | 0.055303 | 0.026073 | 0.001166 | 0.004323 | |
| 15 h | 0.054699 | 0.037828 | 0.001692 | 0.057890 | 0.042331 | 0.001893 | 0.003191 | |
| P | 3 h | 0.338024 | 0.665541 | 0.029764 | 0.322645 | 0.525352 | 0.023494 | -0.015379 |
| 6 h | 0.380737 | 0.804932 | 0.035998 | 0.405015 | 0.923454 | 0.041298 | 0.024277 | |
| 12 h | 0.428597 | 0.572176 | 0.025588 | 0.455575 | 0.680281 | 0.030423 | 0.026978 | |
| 15 h | 0.484958 | 1.048699 | 0.046899 | 0.507848 | 1.212676 | 0.054232 | 0.022889 | |
| SWC | 3 h | 0.012567 | 0.019060 | 0.000852 | 0.013736 | 0.023656 | 0.001058 | 0.001169 |
| 6 h | 0.017016 | 0.026366 | 0.001179 | 0.021119 | 0.033282 | 0.001488 | 0.004103 | |
| 12 h | 0.029611 | 0.049173 | 0.002199 | 0.033421 | 0.048640 | 0.002175 | 0.003810 | |
| 15 h | 0.031970 | 0.045392 | 0.002030 | 0.039874 | 0.055206 | 0.002469 | 0.007904 | |
| TS | 3 h | 0.028479 | 0.024754 | 0.001107 | 0.032860 | 0.028891 | 0.001292 | 0.004381 |
| 6 h | 0.045677 | 0.047974 | 0.002145 | 0.061144 | 0.057966 | 0.002592 | 0.015467 | |
| 12 h | 0.088208 | 0.091400 | 0.004088 | 0.114275 | 0.118003 | 0.005277 | 0.026068 | |
| 15 h | 0.101862 | 0.103646 | 0.004635 | 0.131989 | 0.130062 | 0.005817 | 0.030128 | |
res_singl_perc = res_single.groupby(['Gap', 'var', 'gap_len']).agg({'rmse_stand': 'mean'}).reset_index().pivot(columns = 'Gap', values='rmse_stand', index=['var', 'gap_len'])pd.DataFrame({'Only one var': (res_singl_perc["All variables"] - res_singl_perc["Only one var"]) / res_singl_perc["All variables"] * 100})| Only one var | ||
|---|---|---|
| var | gap_len | |
| TA | 3 | 39.521284 |
| 6 | 47.023566 | |
| 12 | 42.546605 | |
| 15 | 41.605866 | |
| SW_IN | 3 | -1.217559 |
| 6 | 6.684132 | |
| 12 | 4.144939 | |
| 15 | 7.625968 | |
| LW_IN | 3 | 2.759433 |
| 6 | -1.044157 | |
| 12 | 2.704600 | |
| 15 | 1.258656 | |
| VPD | 3 | 34.158966 |
| 6 | 37.247869 | |
| 12 | 30.686333 | |
| 15 | 33.616012 | |
| WS | 3 | 0.286928 |
| 6 | 0.351345 | |
| 12 | 3.943177 | |
| 15 | 2.303891 | |
| PA | 3 | 13.860326 |
| 6 | 12.877279 | |
| 12 | 7.816283 | |
| 15 | 5.511460 | |
| P | 3 | -4.766508 |
| 6 | 5.994219 | |
| 12 | 5.921732 | |
| 15 | 4.507125 | |
| SWC | 3 | 8.507126 |
| 6 | 19.428107 | |
| 12 | 11.400826 | |
| 15 | 19.823475 | |
| TS | 3 | 13.331333 |
| 6 | 25.295932 | |
| 12 | 22.811204 | |
| 15 | 22.825776 |
res_singl_perc = res_single.groupby(['Gap', 'var']).agg({'rmse_stand': 'mean'}).reset_index().pivot(columns = 'Gap', values='rmse_stand', index=['var'])pd.DataFrame({'Only one var': (res_singl_perc["All variables"] - res_singl_perc["Only one var"]) / res_singl_perc["All variables"] * 100})| Only one var | |
|---|---|
| var | |
| TA | 42.774328 |
| SW_IN | 4.678197 |
| LW_IN | 1.385379 |
| VPD | 33.511756 |
| WS | 1.969357 |
| PA | 9.183783 |
| P | 3.475042 |
| SWC | 15.706237 |
| TS | 22.347878 |
Generic vs Specialized
models_generic = models_var.copy()models_generic.model = l_model("1_gap_varying_6-336_v3.pickle")
models_generic['type'] = 'Generic'models_generic| var | model | type | |
|---|---|---|---|
| 0 | TA | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 1 | SW_IN | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 2 | LW_IN | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 3 | VPD | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 4 | WS | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 5 | PA | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 6 | P | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 7 | TS | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 8 | SWC | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
models_var['type'] = 'Fine-tuned one var'models_gen_vs_spec = pd.concat([models_generic, models_var])models_gen_vs_spec| var | model | type | |
|---|---|---|---|
| 0 | TA | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 1 | SW_IN | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 2 | LW_IN | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 3 | VPD | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 4 | WS | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 5 | PA | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 6 | P | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 7 | TS | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 8 | SWC | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Generic |
| 0 | TA | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
| 1 | SW_IN | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
| 2 | LW_IN | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
| 3 | VPD | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
| 4 | WS | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
| 5 | PA | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
| 6 | P | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
| 7 | TS | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
| 8 | SWC | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 | Fine-tuned one var |
@cache_disk("generic")
def get_generic(n_rep=n_rep):
reset_seed()
comp_generic = KalmanImpComparison(models_gen_vs_spec, hai, hai_era, block_len=100+48*7)
return comp_generic.compare(n_rep =n_rep, gap_len = [12, 24, 48, 48*7], var = list(hai.columns))
k_results_generic = get_generic(n_rep)plot_formatter.legend_symbol_size = 300p = plot_compare(k_results_generic, 'type', y = 'rmse', scale_domain=["Fine-tuned one var", "Generic"])
save_show_plot(p, "generic")
pt = table_compare(k_results_generic, 'type')
table_compare_latex(t, base_path_tbl / "generic.tex", label='tbl:generic', caption="\\CapGeneric")
t| type | Generic | Fine-tuned one var | ||||||
|---|---|---|---|---|---|---|---|---|
| Stand. RMSE | mean | std | se | mean | std | se | diff. | |
| Variable | Gap | |||||||
| TA | 6 h | 0.092151 | 0.055814 | 0.002496 | 0.054999 | 0.032663 | 0.001461 | -0.037153 |
| 12 h | 0.118358 | 0.074072 | 0.003313 | 0.077345 | 0.052042 | 0.002327 | -0.041013 | |
| 1 day (24 h) | 0.146578 | 0.076799 | 0.003435 | 0.100970 | 0.058986 | 0.002638 | -0.045608 | |
| 1 week (168 h) | 0.174324 | 0.064495 | 0.002884 | 0.128855 | 0.048376 | 0.002163 | -0.045469 | |
| SW_IN | 6 h | 0.255795 | 0.164694 | 0.007365 | 0.208728 | 0.167255 | 0.007480 | -0.047067 |
| 12 h | 0.314755 | 0.164839 | 0.007372 | 0.257761 | 0.163914 | 0.007330 | -0.056994 | |
| 1 day (24 h) | 0.339310 | 0.128516 | 0.005747 | 0.280578 | 0.131607 | 0.005886 | -0.058733 | |
| 1 week (168 h) | 0.357922 | 0.103167 | 0.004614 | 0.297530 | 0.118822 | 0.005314 | -0.060391 | |
| LW_IN | 6 h | 0.299442 | 0.201432 | 0.009008 | 0.256844 | 0.199753 | 0.008933 | -0.042598 |
| 12 h | 0.367537 | 0.197000 | 0.008810 | 0.326104 | 0.209362 | 0.009363 | -0.041433 | |
| 1 day (24 h) | 0.400412 | 0.175611 | 0.007854 | 0.358755 | 0.185939 | 0.008315 | -0.041657 | |
| 1 week (168 h) | 0.466036 | 0.141424 | 0.006325 | 0.405833 | 0.165308 | 0.007393 | -0.060203 | |
| VPD | 6 h | 0.137863 | 0.104531 | 0.004675 | 0.096682 | 0.089465 | 0.004001 | -0.041182 |
| 12 h | 0.226616 | 0.183575 | 0.008210 | 0.169084 | 0.164870 | 0.007373 | -0.057532 | |
| 1 day (24 h) | 0.272840 | 0.206783 | 0.009248 | 0.202490 | 0.159921 | 0.007152 | -0.070350 | |
| 1 week (168 h) | 0.325113 | 0.154287 | 0.006900 | 0.263913 | 0.153488 | 0.006864 | -0.061201 | |
| WS | 6 h | 0.571539 | 0.523304 | 0.023403 | 0.365262 | 0.200693 | 0.008975 | -0.206276 |
| 12 h | 0.660302 | 0.408751 | 0.018280 | 0.481919 | 0.265006 | 0.011851 | -0.178383 | |
| 1 day (24 h) | 0.676472 | 0.374177 | 0.016734 | 0.510818 | 0.270480 | 0.012096 | -0.165653 | |
| 1 week (168 h) | 0.781636 | 0.264823 | 0.011843 | 0.569137 | 0.189984 | 0.008496 | -0.212499 | |
| PA | 6 h | 0.099325 | 0.066109 | 0.002956 | 0.054017 | 0.032223 | 0.001441 | -0.045308 |
| 12 h | 0.116360 | 0.061026 | 0.002729 | 0.059966 | 0.030824 | 0.001378 | -0.056394 | |
| 1 day (24 h) | 0.136620 | 0.076087 | 0.003403 | 0.072074 | 0.068788 | 0.003076 | -0.064546 | |
| 1 week (168 h) | 0.158988 | 0.068315 | 0.003055 | 0.081116 | 0.053864 | 0.002409 | -0.077872 | |
| P | 6 h | 0.542198 | 0.990843 | 0.044312 | 0.542198 | 0.990843 | 0.044312 | 0.000000 |
| 12 h | 0.627167 | 1.034936 | 0.046284 | 0.627167 | 1.034936 | 0.046284 | 0.000000 | |
| 1 day (24 h) | 0.628179 | 0.575428 | 0.025734 | 0.628179 | 0.575428 | 0.025734 | 0.000000 | |
| 1 week (168 h) | 0.880507 | 0.572737 | 0.025614 | 0.880507 | 0.572737 | 0.025614 | 0.000000 | |
| SWC | 6 h | 0.152144 | 0.107350 | 0.004801 | 0.050597 | 0.033592 | 0.001502 | -0.101547 |
| 12 h | 0.244768 | 0.154341 | 0.006902 | 0.067366 | 0.043308 | 0.001937 | -0.177402 | |
| 1 day (24 h) | 0.392553 | 0.222757 | 0.009962 | 0.078784 | 0.053435 | 0.002390 | -0.313769 | |
| 1 week (168 h) | 0.719138 | 0.314318 | 0.014057 | 0.169313 | 0.103684 | 0.004637 | -0.549825 | |
| TS | 6 h | 0.168111 | 0.114636 | 0.005127 | 0.065352 | 0.068001 | 0.003041 | -0.102759 |
| 12 h | 0.226302 | 0.134730 | 0.006025 | 0.088370 | 0.087491 | 0.003913 | -0.137932 | |
| 1 day (24 h) | 0.286258 | 0.152198 | 0.006807 | 0.129457 | 0.127674 | 0.005710 | -0.156801 | |
| 1 week (168 h) | 0.362279 | 0.176402 | 0.007889 | 0.316745 | 0.217719 | 0.009737 | -0.045534 | |
res_singl_perc = k_results_generic.groupby(['type', 'var']).agg({'rmse_stand': 'mean'}).reset_index().pivot(columns = 'type', values='rmse_stand', index=['var'])(res_singl_perc["Generic"] - res_singl_perc["Fine-tuned one var"]) / res_singl_perc["Generic"] * 100var
TA 31.847749
SW_IN 17.604408
LW_IN 12.122588
VPD 23.925198
WS 28.357855
PA 47.745394
P 0.000000
SWC 75.735168
TS 42.478176
dtype: float64
Training
models_train = pd.DataFrame.from_records([
# {'Train': 'All variables', 'model': l_model("All_gap_all_30_v1.pickle") },
{'Train': 'Only one var', 'model': l_model("1_gap_varying_6-336_v3.pickle") },
{'Train': 'Multi vars', 'model': l_model("all_varying_gap_varying_len_6-30_v3.pickle") },
{'Train': 'Random params', 'model': l_model("rand_all_varying_gap_varying_len_6-30_v4.pickle") }
])models_train| Train | model | |
|---|---|---|
| 0 | Only one var | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 |
| 1 | Multi vars | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 |
| 2 | Random params | Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 |
@cache_disk("train")
def get_train(n_rep):
reset_seed()
kcomp = KalmanImpComparison(models_train, hai, hai_era, block_len=130)
return kcomp.compare(n_rep =n_rep, gap_len = [6, 12, 24, 30], var = list(hai.columns))res_train = get_train(n_rep)res_train_agg = res_train.groupby(['Train', 'gap_len']).agg({'rmse_stand': 'mean'}).reset_index()res_train_agg| Train | gap_len | rmse_stand | |
|---|---|---|---|
| 0 | Multi vars | 3 | 0.131760 |
| 1 | Multi vars | 6 | 0.166202 |
| 2 | Multi vars | 12 | 0.215978 |
| 3 | Multi vars | 15 | 0.225579 |
| 4 | Only one var | 3 | 0.192683 |
| 5 | Only one var | 6 | 0.250933 |
| 6 | Only one var | 12 | 0.332169 |
| 7 | Only one var | 15 | 0.340622 |
| 8 | Random params | 3 | 0.200750 |
| 9 | Random params | 6 | 0.236855 |
| 10 | Random params | 12 | 0.292697 |
| 11 | Random params | 15 | 0.294936 |
p = plot_compare(res_train, "Train", y='rmse', scale_domain=["Multi vars", "Only one var", "Random params"])
save_show_plot(p, "train_compare")
t = table_compare3(res_train, 'Train')
table_compare3_latex(t, base_path_tbl / "train.tex", label="tbl:train_compare", caption="\\CapTrain")
t| Train | Multi vars | Only one var | Random params | ||||
|---|---|---|---|---|---|---|---|
| Stand. RMSE | mean | std | mean | std | mean | std | |
| Variable | Gap [$h$] | ||||||
| TA | 3 h | 0.028752 | 0.023775 | 0.066495 | 0.049728 | 0.060459 | 0.045142 |
| 6 h | 0.044355 | 0.032665 | 0.092593 | 0.062233 | 0.079903 | 0.055433 | |
| 12 h | 0.072632 | 0.049516 | 0.133494 | 0.091525 | 0.117427 | 0.085482 | |
| 15 h | 0.077183 | 0.051459 | 0.130566 | 0.071501 | 0.111204 | 0.067554 | |
| SW_IN | 3 h | 0.178118 | 0.175674 | 0.201927 | 0.178423 | 0.259459 | 0.219808 |
| 6 h | 0.210377 | 0.170297 | 0.249492 | 0.175702 | 0.282919 | 0.209136 | |
| 12 h | 0.268875 | 0.172214 | 0.307884 | 0.167333 | 0.333024 | 0.192553 | |
| 15 h | 0.269610 | 0.165685 | 0.315006 | 0.164416 | 0.333722 | 0.187786 | |
| LW_IN | 3 h | 0.200445 | 0.169263 | 0.215081 | 0.177105 | 0.298262 | 0.202726 |
| 6 h | 0.269851 | 0.215872 | 0.296780 | 0.229277 | 0.341691 | 0.228729 | |
| 12 h | 0.331508 | 0.224076 | 0.376745 | 0.241035 | 0.398407 | 0.259110 | |
| 15 h | 0.356784 | 0.208411 | 0.389550 | 0.207828 | 0.398491 | 0.224671 | |
| VPD | 3 h | 0.064145 | 0.060110 | 0.097167 | 0.079462 | 0.150077 | 0.127847 |
| 6 h | 0.098231 | 0.096103 | 0.147885 | 0.112805 | 0.193383 | 0.174094 | |
| 12 h | 0.168481 | 0.155672 | 0.222713 | 0.160915 | 0.240388 | 0.192226 | |
| 15 h | 0.185879 | 0.158725 | 0.241190 | 0.167722 | 0.254005 | 0.210471 | |
| WS | 3 h | 0.305451 | 0.182717 | 0.471322 | 0.459590 | 0.447337 | 0.299554 |
| 6 h | 0.371935 | 0.210461 | 0.575585 | 0.465884 | 0.481752 | 0.308642 | |
| 12 h | 0.429368 | 0.195014 | 0.649132 | 0.481338 | 0.523181 | 0.269521 | |
| 15 h | 0.478824 | 0.230152 | 0.690806 | 0.456658 | 0.540468 | 0.292931 | |
| PA | 3 h | 0.024824 | 0.018937 | 0.073198 | 0.061610 | 0.067156 | 0.056764 |
| 6 h | 0.035399 | 0.020712 | 0.094714 | 0.061088 | 0.097657 | 0.076961 | |
| 12 h | 0.051464 | 0.045255 | 0.117277 | 0.073159 | 0.118236 | 0.089099 | |
| 15 h | 0.058828 | 0.039719 | 0.127520 | 0.079590 | 0.114797 | 0.076361 | |
| P | 3 h | 0.348034 | 0.680361 | 0.411050 | 1.509773 | 0.370699 | 0.713138 |
| 6 h | 0.399553 | 0.617684 | 0.486592 | 0.718194 | 0.425397 | 0.704196 | |
| 12 h | 0.511262 | 0.851207 | 0.706403 | 1.150406 | 0.535496 | 0.980554 | |
| 15 h | 0.453258 | 0.597555 | 0.618186 | 0.755660 | 0.469612 | 0.698131 | |
| SWC | 3 h | 0.012302 | 0.029214 | 0.095440 | 0.075541 | 0.106073 | 0.074970 |
| 6 h | 0.018203 | 0.030712 | 0.150918 | 0.108401 | 0.153282 | 0.106928 | |
| 12 h | 0.027391 | 0.036781 | 0.245083 | 0.160379 | 0.246170 | 0.215060 | |
| 15 h | 0.033106 | 0.041433 | 0.282246 | 0.227894 | 0.278409 | 0.208012 | |
| TS | 3 h | 0.023765 | 0.017708 | 0.102470 | 0.082133 | 0.047228 | 0.037464 |
| 6 h | 0.047910 | 0.048379 | 0.163837 | 0.105534 | 0.075711 | 0.073902 | |
| 12 h | 0.082818 | 0.086185 | 0.230788 | 0.157936 | 0.121944 | 0.134360 | |
| 15 h | 0.116737 | 0.128579 | 0.270524 | 0.190755 | 0.153716 | 0.160674 | |
Extra results
Standard deviations
hai_std = hai.std().to_frame(name='std')
hai_std.index.name = "Variable"
hai_std = hai_std.reset_index().assign(unit=[f"\\si{{{unit}}}" for unit in units_big.values()])hai_std| Variable | std | unit | |
|---|---|---|---|
| 0 | TA | 7.924611 | \si{°C} |
| 1 | SW_IN | 204.002561 | \si{W m-2} |
| 2 | LW_IN | 41.955741 | \si{hPa} |
| 3 | VPD | 4.368417 | \si{hPa} |
| 4 | WS | 1.625425 | \si{mm} |
| 5 | PA | 0.855159 | \si{m s-1} |
| 6 | P | 0.280276 | \si{W m-2} |
| 7 | SWC | 8.913119 | \si{%} |
| 8 | TS | 5.658643 | \si{°C} |
latex = hai_std.style.hide(axis="index").format(precision=3).to_latex(hrules=True, caption="\\CapStd", label="tbl:hai_std", position_float="centering")
with open(base_path_tbl / "hai_std.tex", 'w') as f:
f.write(latex)Gap distribution
out_dir = here("../fluxnet/gap_stat")site_info = pl.read_parquet(out_dir / "../site_info.parquet").select([
pl.col("start").cast(pl.Utf8).str.strptime(pl.Datetime, "%Y%m%d%H%M"),
pl.col("end").cast(pl.Utf8).str.strptime(pl.Datetime, "%Y%m%d%H%M"),
pl.col("site").cast(pl.Categorical).sort()
])def duration_n_obs(duration):
"converts a duration into a n of fluxnet observations"
return abs(int(duration.total_seconds() / (30 * 60)))files = out_dir.ls()
files.sort() # need to sort to match the site_info
sites = []
for i, path in enumerate(files):
sites.append(pl.scan_parquet(path).with_columns([
pl.lit(site_info[i, "site"]).alias("site"),
pl.lit(duration_n_obs(site_info[i, "start"] - site_info[i, "end"])).alias("total_obs"),
pl.col("TIMESTAMP_END").cast(pl.Utf8).str.strptime(pl.Datetime, "%Y%m%d%H%M").alias("end"),
]).drop("TIMESTAMP_END"))
gap_stat = pl.concat(sites)pl.read_parquet(files[0])
shape: (31574, 3)
TIMESTAMP_END
gap_len
variable
i64
u32
str
200901010030
16992
"TA_F_MDS_QC"
200912211100
5
"TA_F_MDS_QC"
200912211700
1
"TA_F_MDS_QC"
201001061300
1
"TA_F_MDS_QC"
201001071300
3
"TA_F_MDS_QC"
201001081130
2
"TA_F_MDS_QC"
201001081830
1
"TA_F_MDS_QC"
201001091000
2
"TA_F_MDS_QC"
201001191000
1
"TA_F_MDS_QC"
201001191130
2
"TA_F_MDS_QC"
201001291900
1
"TA_F_MDS_QC"
201002131030
2
"TA_F_MDS_QC"
...
...
...
201103241630
3
"NEE_VUT_95_QC"
201103241930
29
"NEE_VUT_95_QC"
201103251900
8
"NEE_VUT_95_QC"
201103260030
1
"NEE_VUT_95_QC"
201103260200
1
"NEE_VUT_95_QC"
201103260300
1
"NEE_VUT_95_QC"
201103261230
1
"NEE_VUT_95_QC"
201103261330
2
"NEE_VUT_95_QC"
201103261630
1
"NEE_VUT_95_QC"
201103261930
2
"NEE_VUT_95_QC"
201103262100
1
"NEE_VUT_95_QC"
201103270000
13441
"NEE_VUT_95_QC"
gap_stat.head().collect()
shape: (5, 5)
gap_len
variable
site
total_obs
end
u32
str
str
i32
datetime[μs]
16992
"TA_F_MDS_QC"
"AR-SLu"
52559
2009-01-01 00:30:00
5
"TA_F_MDS_QC"
"AR-SLu"
52559
2009-12-21 11:00:00
1
"TA_F_MDS_QC"
"AR-SLu"
52559
2009-12-21 17:00:00
1
"TA_F_MDS_QC"
"AR-SLu"
52559
2010-01-06 13:00:00
3
"TA_F_MDS_QC"
"AR-SLu"
52559
2010-01-07 13:00:00
def plot_var_dist(var, small=False, ax=None):
if ax is None: ax = get_grid(1)[0]
ta_gaps = gap_stat.filter(
(pl.col("variable") == var)
).filter(
pl.col("gap_len") < 200 if small else True
).with_column(pl.col("gap_len") / (24 *2 * 7)).collect().to_pandas().hist("gap_len", bins=50, ax=ax)
ax.set_title(f"{var} - { 'gaps < 200' if small else 'all gaps'}")
if not small: ax.set_yscale('log')
ax.set_xlabel("gap length (weeks)")
ax.set_ylabel(f"{'Log' if not small else ''} n gaps")
# plt.xscale('log') plot_var_dist('TA_F_QC')
color_map = dict(zip(scale_meteo.domain, list(sns.color_palette('Set2', n_colors=len(hai.columns)).as_hex())))qc_map = {
'TA': 'TA_F_QC',
'SW_IN': 'SW_IN_F_QC',
'LW_IN': 'LW_IN_F_QC',
'VPD': 'VPD_F_QC',
'WS': 'WS_F_QC',
'PA': 'PA_F_QC',
'P': 'P_F_QC',
'TS': 'TS_F_MDS_1_QC',
'SWC': 'SWC_F_MDS_1_QC',
}def pl_in(col, values):
expr = False
for val in values:
expr |= pl.col(col) == val
return exprgap_stat.filter(pl_in('variable', qc_map.values())
).with_columns([
pl.when(pl.col("gap_len") < 48*7).then(True).otherwise(False).alias("short"),
pl.count().alias("total"),
pl.count().alias("total len"),
]).groupby("short").agg([
(pl.col("gap_len").count() / pl.col("total")).alias("frac_num"),
(pl.col("gap_len").sum() / pl.col("total len")).alias("frac_len")
]).collect()
shape: (2, 3)
short
frac_num
frac_len
bool
list[f64]
list[f64]
false
[0.010775, 0.010775, ... 0.010775]
[63.844386, 63.844386, ... 63.844386]
true
[0.989225, 0.989225, ... 0.989225]
[7.261655, 7.261655, ... 7.261655]
gap_stat.filter(pl_in('variable', qc_map.values())
).with_columns([
pl.when(pl.col("gap_len") < 48*7).then(True).otherwise(False).alias("short"),
pl.count().alias("total"),
pl.count().alias("total len"),
]).groupby("short").agg([
(pl.col("gap_len").count() / pl.col("total")).alias("frac_num"),
(pl.col("gap_len").sum() / pl.col("total len")).alias("frac_len")
]).collect()
shape: (2, 3)
short
frac_num
frac_len
bool
list[f64]
list[f64]
false
[0.010775, 0.010775, ... 0.010775]
[63.844386, 63.844386, ... 63.844386]
true
[0.989225, 0.989225, ... 0.989225]
[7.261655, 7.261655, ... 7.261655]
frac_miss = gap_stat.filter(
pl_in('variable', qc_map.values())
).groupby(["site", "variable"]).agg([
pl.col("gap_len").mean().alias("mean"),
(pl.col("gap_len").sum() / pl.col("total_obs").first()).alias("frac_gap")
])frac_miss.groupby('variable').agg([
pl.col("frac_gap").max().alias("max"),
pl.col("frac_gap").min().alias("min"),
pl.col("frac_gap").std().alias("std"),
pl.col("frac_gap").mean().alias("mean"),
]).collect()
shape: (9, 5)
variable
max
min
std
mean
str
f64
f64
f64
f64
"TA_F_QC"
2.299027
0.000011
0.256043
0.198846
"TS_F_MDS_1_QC"
3.090642
0.000011
0.38542
0.282106
"P_F_QC"
1.503296
0.000011
0.330422
0.311598
"VPD_F_QC"
2.298799
0.000011
0.297668
0.245457
"PA_F_QC"
2.675381
0.000011
0.420542
0.387658
"LW_IN_F_QC"
7.074603
0.000019
0.704469
0.578246
"SWC_F_MDS_1_QC...
1.487143
0.000011
0.29985
0.314847
"WS_F_QC"
2.675324
0.000798
0.323944
0.257303
"SW_IN_F_QC"
2.193984
0.000038
0.21952
0.16772
frac_miss.sort("frac_gap", reverse=True).collect()
shape: (1798, 4)
site
variable
mean
frac_gap
str
str
f64
f64
"US-LWW"
"LW_IN_F_QC"
11267.590909
7.074603
"US-Wi5"
"LW_IN_F_QC"
70128.0
3.992031
"ES-LgS"
"LW_IN_F_QC"
175344.0
3.333093
"US-LWW"
"TS_F_MDS_1_QC"
1320.646341
3.090642
"US-Wi5"
"TS_F_MDS_1_QC"
62.142857
2.897194
"US-Wi0"
"LW_IN_F_QC"
1369.694444
2.814601
"US-ORv"
"PA_F_QC"
3.899334
2.675381
"US-ORv"
"WS_F_QC"
3.899251
2.675324
"US-LWW"
"PA_F_QC"
409.701754
2.665944
"US-Wi5"
"WS_F_QC"
39.57814
2.349861
"US-Wi5"
"PA_F_QC"
51.453972
2.322707
"US-Wi5"
"TA_F_QC"
45.790249
2.299027
...
...
...
...
"CN-Qia"
"P_F_QC"
1.0
0.000019
"CN-Din"
"TS_F_MDS_1_QC"
1.0
0.000019
"CN-Qia"
"TS_F_MDS_1_QC"
1.0
0.000019
"CN-Ha2"
"TS_F_MDS_1_QC"
1.0
0.000019
"CN-Cha"
"TA_F_QC"
1.0
0.000019
"CN-Din"
"PA_F_QC"
1.0
0.000019
"US-Me6"
"P_F_QC"
1.0
0.000011
"US-Me6"
"PA_F_QC"
1.0
0.000011
"US-Me6"
"SWC_F_MDS_1_QC...
1.0
0.000011
"US-Me6"
"VPD_F_QC"
1.0
0.000011
"US-Me6"
"TA_F_QC"
1.0
0.000011
"US-Me6"
"TS_F_MDS_1_QC"
1.0
0.000011
site_info.filter((pl.col("site") == "US-LWW"))
shape: (1, 3)
start
end
site
datetime[μs]
datetime[μs]
cat
1997-01-01 00:30:00
1999-01-01 00:00:00
"US-LWW"
gap_stat.filter((pl.col("site") == "US-LWW") & (pl.col("variable") == "LW_IN_F_QC" )).collect()
shape: (22, 5)
gap_len
variable
site
total_obs
end
u32
str
str
i32
datetime[μs]
246556
"LW_IN_F_QC"
"US-LWW"
35039
2000-01-01 00:30:00
19
"LW_IN_F_QC"
"US-LWW"
35039
2014-02-11 08:30:00
6
"LW_IN_F_QC"
"US-LWW"
35039
2014-03-08 08:00:00
1
"LW_IN_F_QC"
"US-LWW"
35039
2014-03-08 11:30:00
1
"LW_IN_F_QC"
"US-LWW"
35039
2014-03-08 13:00:00
50
"LW_IN_F_QC"
"US-LWW"
35039
2014-11-18 13:00:00
21
"LW_IN_F_QC"
"US-LWW"
35039
2014-11-19 22:30:00
22
"LW_IN_F_QC"
"US-LWW"
35039
2014-11-24 23:30:00
55
"LW_IN_F_QC"
"US-LWW"
35039
2014-11-26 06:30:00
172
"LW_IN_F_QC"
"US-LWW"
35039
2014-11-27 19:30:00
70
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-01 23:00:00
3
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-03 11:30:00
35
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-03 16:30:00
92
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-04 11:00:00
14
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-11 05:00:00
36
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-11 18:00:00
36
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-12 17:30:00
87
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-13 18:00:00
70
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-16 01:30:00
37
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-17 16:00:00
10
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-21 07:30:00
494
"LW_IN_F_QC"
"US-LWW"
35039
2014-12-21 17:30:00
import matplotlibmatplotlib.rcParams.update({'font.size': 22})
sns.set_style("whitegrid")def plot_var_dist(var, ax=None, small=True):
if ax is None: ax = get_grid(1)[0]
color = color_map[var]
var_qc = qc_map[var]
ta_gaps = gap_stat.filter(
(pl.col("variable") == var_qc)
).filter(
pl.col("gap_len") < (24 * 2 *7) if small else True
).with_column(pl.col("gap_len") / (2 if small else 48 * 7)
).collect().to_pandas().hist("gap_len", bins=50, ax=ax, edgecolor="white", color=color)
ax.set_title(f"{var} - { 'gap length < 1 week' if small else 'all gaps'}")
ax.set_xlabel(f"Gap length ({ 'hour' if small else 'week'})")
ax.set_ylabel(f"Log n gaps")
ax.set_yscale('log')
plt.tight_layout()vars = gap_stat.select(pl.col("variable").unique()).collect()vars.filter(pl.col("variable").str.contains("TA"))
shape: (4, 1)
variable
str
"NEE_VUT_USTAR5...
"NEE_CUT_USTAR5...
"TA_F_QC"
"TA_F_MDS_QC"
for ax, var in zip(get_grid(9,3,3, figsize=(15,12), sharey=False), list(var_type.categories)):
plot_var_dist(var, ax=ax)
plt.savefig(base_path_img / "gap_len_dist_small.pdf")
for ax, var in zip(get_grid(9,3,3, figsize=(15,12), sharey=False), list(var_type.categories)):
plot_var_dist(var, ax=ax, small=False)
plt.savefig(base_path_img / "gap_len_dist.pdf")
Square Root Filter
Numerical Stability
from meteo_imp.kalman.performance import *err = cache_disk("fuzz_sr")(fuzz_filter_SR)(100, 110)p = plot_err_sr_filter(err)
save_show_plot(p, "numerical_stability")
Performance
@cache_disk("perf_sr")
def get_perf_sr():
reset_seed()
return perf_comb_params('filter', use_sr_filter=[True, False], rep=range(100)) perf1 = (get_perf_sr()
.groupby('use_sr_filter')
.agg(pl.col("time").mean())
.with_column(
pl.when(pl.col("use_sr_filter"))
.then(pl.lit("Square Root Filter"))
.otherwise(pl.lit("Standard Filter"))
.alias("Filter type")
))perf1
shape: (2, 3)
use_sr_filter
time
Filter type
bool
f64
str
false
0.237968
"Standard Filte...
true
0.231648
"Square Root Fi...
plot_perf_sr = alt.Chart(perf1.to_pandas()).mark_bar(size = 50).encode(
x=alt.X('Filter type', axis=alt.Axis(labelAngle=0)),
y=alt.Y('time', scale=alt.Scale(zero=False), title="time [s]"),
color=alt.Color('Filter type',
scale = alt.Scale(scheme = 'accent'))
).properties(width=300)save_show_plot(plot_perf_sr, "perf_sr")